import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
# The first command-line argument is a directory: it is added to the import
# search path and is also where atp_tennis.csv is read from.
sys.path.append(sys.argv[1])

import pandas as pd
import pickle

# Load the match records from <argv[1]>/atp_tennis.csv.
atp_tennis = pd.read_csv(os.path.join(sys.argv[1], 'atp_tennis.csv'))

# Data Quality Report
# 'Missing Values' is counted for every column; 'Unique Values' only for the
# five columns listed below, so every other row of the report holds NaN there.
missing_values = atp_tennis.isna().sum()
unique_values = atp_tennis.loc[
    :, ['Tournament', 'Series', 'Court', 'Surface', 'Round']
].nunique()

data_quality_report = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Unique Values': unique_values,
    }
)

print(data_quality_report)
# pickle.dump(data_quality_report,open("./ref_result/data_quality_report.pkl","wb"))

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle


# Per-player win/loss ratios, descriptive statistics and three overview charts.
# Reads the module-level `atp_tennis` DataFrame, adds the columns
# 'Win_Loss_Ratio_1' / 'Win_Loss_Ratio_2' to it, and writes PNG files into
# ./ref_result.

# Count the number of wins for each player
player_wins = atp_tennis['Winner'].value_counts()

# Count the total number of matches played by each player.  Series addition
# aligns on the index, so with a plain `+` a player who only ever appears in
# one of the two columns would end up as NaN; fill_value=0 keeps them.
player_matches = atp_tennis['Player_1'].value_counts().add(
    atp_tennis['Player_2'].value_counts(), fill_value=0
)

# Players without a single win are absent from `player_wins`.  Align the wins
# to the full player index so they count as 0 wins instead of NaN.
_wins_per_player = player_wins.reindex(player_matches.index, fill_value=0)
_losses_per_player = player_matches - _wins_per_player

# Calculate win/loss ratio for each player.  With zero losses the division
# yields inf, which would turn every later mean/std/ANOVA into inf or NaN, so
# an undefined ratio is stored as NaN (missing) instead.
win_loss_ratios = (_wins_per_player / _losses_per_player).replace(
    [np.inf, -np.inf], np.nan
)

# Add win/loss ratios to the dataset
atp_tennis['Win_Loss_Ratio_1'] = atp_tennis['Player_1'].map(win_loss_ratios)
atp_tennis['Win_Loss_Ratio_2'] = atp_tennis['Player_2'].map(win_loss_ratios)

# Stack the Player_1 and Player_2 values into single columns so that each
# match contributes two observations (one per participant).
combined_ranks = pd.concat([atp_tennis['Rank_1'], atp_tennis['Rank_2']]).reset_index(drop=True)
combined_win_loss_ratios = pd.concat([atp_tennis['Win_Loss_Ratio_1'], atp_tennis['Win_Loss_Ratio_2']]).reset_index(drop=True)

# Create a DataFrame with combined ranks and win/loss ratios
combined_data = pd.DataFrame({'Rank': combined_ranks, 'Win_Loss_Ratio': combined_win_loss_ratios})

# Descriptive Statistics Table
desc_stats = combined_data.describe()

# The charts below are written into ./ref_result; create it if necessary so
# savefig does not fail on a fresh checkout.
os.makedirs('./ref_result', exist_ok=True)

# Histogram for Player Rankings Distribution
# NaN ranks are dropped first: the histogram cannot derive a bin range from
# data that contains NaN.
fig1, ax1 = plt.subplots()
ax1.hist(combined_ranks.dropna(), bins=50, alpha=0.5)
ax1.set_title('Player Rankings Distribution')
ax1.set_xlabel('Ranking')
# Save through the figure object rather than the implicit "current figure".
fig1.savefig('./ref_result/hist_chart.png')
# plt.show()

# Scatter Plot for Player Rankings vs Win/Loss Ratios
fig2, ax2 = plt.subplots()
ax2.scatter(combined_data['Rank'], combined_data['Win_Loss_Ratio'], alpha=0.5)
ax2.set_title('Player Rankings vs Win/Loss Ratios')
ax2.set_xlabel('Ranking')
ax2.set_ylabel('Win/Loss Ratio')
fig2.savefig('./ref_result/scatter_chart.png')
# plt.show()

# Bar Chart for Surface Types
fig3, ax3 = plt.subplots()
surface_counts = atp_tennis['Surface'].value_counts()
surface_counts.plot.bar(ax=ax3)
ax3.set_title('Matches by Surface Type')
ax3.set_xlabel('Surface')
ax3.set_ylabel('Number of Matches')
fig3.savefig('./ref_result/bar_chart.png')
# plt.show()

print(desc_stats)
# pickle.dump(desc_stats,open("./ref_result/desc_stats.pkl","wb"))


import numpy as np
import pandas as pd
import scipy.stats as stats
import pickle


# Surface analysis: correlation between surface indicators and per-match
# averages, a one-way ANOVA of the average win/loss ratio across Hard / Clay /
# Grass, and a 95% confidence interval of that average for each surface.
# Reads the module-level `atp_tennis` DataFrame, including its
# 'Win_Loss_Ratio_1' / 'Win_Loss_Ratio_2' columns.

# Create a DataFrame with surface types, player rankings, and win/loss ratios
surface_data = atp_tennis[['Surface', 'Rank_1', 'Rank_2', 'Win_Loss_Ratio_1', 'Win_Loss_Ratio_2']].copy()


# Remove rows with missing or invalid data in the Win_Loss_Ratio columns.
# dropna() alone keeps +/-inf (a ratio with zero losses), and a single inf
# turns the means, SEMs and the ANOVA below into inf/NaN, so infinities are
# converted to NaN first and dropped together with the missing values.
_ratio_columns = ['Win_Loss_Ratio_1', 'Win_Loss_Ratio_2']
surface_data_clean = surface_data.copy()
surface_data_clean[_ratio_columns] = surface_data_clean[_ratio_columns].replace(
    [np.inf, -np.inf], np.nan
)
surface_data_clean = surface_data_clean.dropna(subset=_ratio_columns).copy()


# One-hot encoding for surface types (one indicator column per surface name)
surface_data_clean = pd.get_dummies(surface_data_clean, columns=['Surface'], prefix='', prefix_sep='')


# Calculate the average player ranking and win/loss ratio for each match
surface_data_clean.loc[:, 'Avg_Rank'] = (surface_data_clean['Rank_1'] + surface_data_clean['Rank_2']) / 2
surface_data_clean.loc[:, 'Avg_Win_Loss_Ratio'] = (surface_data_clean['Win_Loss_Ratio_1'] + surface_data_clean['Win_Loss_Ratio_2']) / 2


# Create a Correlation Matrix
corr_matrix = surface_data_clean[['Hard', 'Clay', 'Grass', 'Avg_Rank', 'Avg_Win_Loss_Ratio']].corr()


# Average win/loss ratio of the matches played on each surface
_ratio_by_surface = {
    surface: surface_data_clean.loc[surface_data_clean[surface] == 1, 'Avg_Win_Loss_Ratio']
    for surface in ('Hard', 'Clay', 'Grass')
}


# Perform Statistical Tests
anova_result_clean = stats.f_oneway(_ratio_by_surface['Hard'],
                                    _ratio_by_surface['Clay'],
                                    _ratio_by_surface['Grass'])


def _mean_sem_ci(values, confidence):
    """Return ``(mean, sem, (low, high))`` for a sample.

    The interval is the two-sided t-based confidence interval of the mean.
    Its critical value uses ``len(values) - 1`` degrees of freedom, i.e. the
    size of this sample, because the standard error is also estimated from
    this sample only.
    """
    mean = values.mean()
    sem = stats.sem(values)
    t_crit = stats.t.ppf((1 + confidence) / 2, len(values) - 1)
    return mean, sem, (mean - t_crit * sem, mean + t_crit * sem)


# Calculate the mean, the standard error of the mean and the confidence
# interval for each surface type
confidence_level = 0.95

hard_mean, hard_sem, hard_ci = _mean_sem_ci(_ratio_by_surface['Hard'], confidence_level)
clay_mean, clay_sem, clay_ci = _mean_sem_ci(_ratio_by_surface['Clay'], confidence_level)
grass_mean, grass_sem, grass_ci = _mean_sem_ci(_ratio_by_surface['Grass'], confidence_level)


# Critical value based on the size of the whole cleaned dataset.  It is kept
# because it is printed below; the per-surface intervals above use their own
# degrees of freedom instead.
degrees_of_freedom = len(surface_data_clean) - 1
t_critical = stats.t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

print("Correlation Matrix:\n", corr_matrix)
# pickle.dump(corr_matrix,open("./ref_result/corr_matrix.pkl","wb"))
print("P-value:", anova_result_clean.pvalue)
# pickle.dump(anova_result_clean.pvalue,open("./ref_result/pvalue.pkl","wb"))
print("Confidence Intervals:\n", t_critical)
# pickle.dump(t_critical,open("./ref_result/t_critical.pkl","wb"))
print("Hard Surface:", hard_ci)
# pickle.dump(hard_ci,open("./ref_result/hard_ci.pkl","wb"))
print("Clay Surface:", clay_ci)
# pickle.dump(clay_ci,open("./ref_result/clay_ci.pkl","wb"))
print("Grass Surface:", grass_ci)
# pickle.dump(grass_ci,open("./ref_result/grass_ci.pkl","wb"))




import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pickle


# Win/loss ratio by ranking group and surface: reshapes the module-level
# `atp_tennis` DataFrame to one row per (match, player), buckets players by
# their average rank, plots the mean ratio per bucket and surface, and runs a
# one-way ANOVA across Hard / Clay / Grass within each bucket.

# Create a new DataFrame with separate rows for each player
_player_columns = ['Player', 'Rank', 'Win_Loss_Ratio', 'Surface']

# Player_1 side of every match
player_data_1 = atp_tennis[['Player_1', 'Rank_1', 'Win_Loss_Ratio_1', 'Surface']].copy()
player_data_1.columns = _player_columns

# Player_2 side of every match
player_data_2 = atp_tennis[['Player_2', 'Rank_2', 'Win_Loss_Ratio_2', 'Surface']].copy()
player_data_2.columns = _player_columns

# Concatenate Player_1 and Player_2 data
player_data = pd.concat([player_data_1, player_data_2], ignore_index=True)

# An infinite ratio (zero losses) is not removed by dropna() and would make
# the group means infinite and the ANOVA p-values NaN; treat it as missing.
player_data['Win_Loss_Ratio'] = player_data['Win_Loss_Ratio'].replace(
    [np.inf, -np.inf], np.nan
)

# Group the DataFrame by player and compute the average rank and win/loss ratio for each unique player
# ('Surface': 'count' counts the rows with a non-null surface and becomes Match_Count)
unique_player_data = player_data.groupby('Player').agg({'Rank': 'mean', 'Win_Loss_Ratio': 'mean', 'Surface': 'count'}).reset_index()
unique_player_data.columns = ['Player', 'Avg_Rank', 'Avg_Win_Loss_Ratio', 'Match_Count']

# Define ranking groups based on average rank: (0, 50], (50, 200], (200, inf]
unique_player_data['Rank_Group'] = pd.cut(unique_player_data['Avg_Rank'], bins=[0, 50, 200, np.inf], labels=['Top-ranked', 'Mid-ranked', 'Low-ranked'])

# Attach each player's ranking group to every one of their rows.  Done once
# here and reused for both the chart and the per-group tests below.
_player_data_with_group = player_data.merge(unique_player_data[['Player', 'Rank_Group']], on='Player')

# Calculate the average win/loss ratio for each ranking group and surface type.
# observed=False is spelled out so every group/surface combination is kept
# regardless of the pandas default for categorical group keys.
grouped_data = (
    _player_data_with_group
    .groupby(['Rank_Group', 'Surface'], observed=False)
    .agg({'Win_Loss_Ratio': 'mean'})
    .reset_index()
)

# Outputs below are written into ./ref_result; create it if necessary.
os.makedirs('./ref_result', exist_ok=True)

# Create a bar chart comparing win/loss ratios across ranking groups and surface types
plt.figure(figsize=(12, 6))
sns.barplot(x='Surface', y='Win_Loss_Ratio', hue='Rank_Group', data=grouped_data)
plt.title('Win/Loss Ratios by Surface Type and Ranking Group')
plt.ylabel('Average Win/Loss Ratio')
plt.savefig("./ref_result/barplot.png")
# plt.show()

# Perform statistical tests for each ranking group
# Maps group label -> ANOVA p-value, or None when a surface has no usable data.
anova_results = {}
for group in ['Top-ranked', 'Mid-ranked', 'Low-ranked']:
    group_data = _player_data_with_group[
        _player_data_with_group['Rank_Group'] == group
    ].reset_index(drop=True)
    hard_data = group_data['Win_Loss_Ratio'][group_data['Surface'] == 'Hard'].dropna()
    clay_data = group_data['Win_Loss_Ratio'][group_data['Surface'] == 'Clay'].dropna()
    grass_data = group_data['Win_Loss_Ratio'][group_data['Surface'] == 'Grass'].dropna()

    # The test needs at least one observation on every surface
    if len(hard_data) > 0 and len(clay_data) > 0 and len(grass_data) > 0:
        anova_result = stats.f_oneway(hard_data, clay_data, grass_data)
        anova_results[group] = anova_result.pvalue
    else:
        anova_results[group] = None

print(anova_results)
# Use a context manager so the pickle file is flushed and closed deterministically.
with open("./ref_result/anova_results.pkl", "wb") as _anova_file:
    pickle.dump(anova_results, _anova_file)